#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2020 Sebastian Wagner, 2023 Filip Pokorný
# SPDX-License-Identifier: AGPL-3.0-or-later

# This script generates the "event.md" documentation page.

import codecs
import json
import os.path

from ruamel.yaml import YAML

# Parent of the directory containing this script; the input and output paths
# used by this script are joined onto it.
BASEDIR = os.path.join(os.path.dirname(__file__), '../')

# YAML handler created with typ="safe" and pure=True.
# NOTE(review): not referenced anywhere in the visible part of this script - confirm whether it is still needed.
yaml = YAML(typ="safe", pure=True)

# Static first part of the generated "event.md" page (Markdown).
# The literal is deliberately NOT a raw string: the backslash right after the opening quotes
# is a line continuation that suppresses the leading newline. Consequently every backslash
# that has to reach the Markdown output (e.g. the escaped asterisk "\*") is written doubled.
HEADER = """\
<!-- comment
   SPDX-FileCopyrightText: 2015-2023 Sebastian Wagner, Filip Pokorný
   SPDX-License-Identifier: AGPL-3.0-or-later

   This document is automatically generated by `generate-event-docs.py` script.
-->

# Event

An event represents an individual piece of data processed by IntelMQ. It uses the JSON format.

Example Event:

```json
{
    "source.geolocation.cc": "JO",
    "malware.name": "qakbot",
    "source.ip": "82.212.115.188",
    "source.asn": 47887,
    "classification.type": "c2-server",
    "extra.status": "offline",
    "source.port": 443,
    "classification.taxonomy": "malicious-code",
    "source.geolocation.latitude": 31.9522,
    "feed.accuracy": 100,
    "extra.last_online": "2023-02-16",
    "time.observation": "2023-02-16T09:55:12+00:00",
    "source.geolocation.city": "amman",
    "source.network": "82.212.115.0/24",
    "time.source": "2023-02-15T14:19:09+00:00",
    "source.as_name": "NEU-AS",
    "source.geolocation.longitude": 35.939,
    "feed.name": "abusech-feodo-c2-tracker"
}
```

## Minimum Requirements

Below, we have enumerated the minimum recommended requirements for an actionable abuse event. These keys should be
present for the abuse report to make sense for the end recipient. Please note that if you choose to anonymize your
sources, you can substitute **feed.name** with **feed.code**. At least one of the fields **ip**, **fqdn**, **url** or **account** should be present. All the rest of the keys are optional. This list of required fields is *not* enforced by IntelMQ.

| Field                   | Terminology |
| ----------------------- | ----------- |
| feed.name               | Should      |
| classification.type     | Should      |
| classification.taxonomy | Should      |
| time.source             | Should      |
| time.observation        | Should      |
| source.ip               | Should\\*    |
| source.fqdn             | Should\\*    |
| source.url              | Should\\*    |
| source.account          | Should\\*    |

\\* at least one of them

## Classification

IntelMQ classifies events using three labels: `classification.taxonomy`, `classification.type` and `classification.identifier`. This tuple of three values can be used for deduplication of events and describes what happened.

The taxonomy can be automatically added by the taxonomy expert bot based on the given type. The following classification scheme loosely follows the [Reference Security Incident Taxonomy (RSIT)](https://github.com/enisaeu/Reference-Security-Incident-Taxonomy-Task-Force/):

| Classification Taxonomy | Classification Type | Description |
|----------------------|----------------------|----------------------|
| abusive-content      | harmful-speech       | Discreditation or discrimination of somebody, cyber stalking, racism or threats against one or more individuals.  |
| abusive-content      | spam                 |   Or 'Unsolicited Bulk Email', this means that the recipient has not granted verifiable permission for the message to be sent and that the message is sent as part of a larger collection of messages, all having a functionally comparable content. |
|   abusive-content    | violence             |   Child pornography, glorification of violence, etc.     |
|   availability       | ddos                 |   Distributed Denial of Service attack, e.g. SYN-Flood or UDP-based reflection/amplification attacks.           |
| availability       | dos                  | Denial of Service attack, e.g. sending specially crafted requests to a web application which causes the application to crash or slow down. |
| availability       | misconfiguration     | Software misconfiguration resulting in service availability issues, e.g. DNS server with outdated DNSSEC Root Zone KSK. |
| availability       | outage               | Outage caused e.g. by air condition failure or natural disaster. |
| availability       | sabotage             | Physical sabotage, e.g. cutting wires or malicious arson. |
| fraud              | copyright            | Offering or Installing copies of unlicensed commercial software or other copyright protected materials (Warez). |
| fraud              | masquerade           | Type of attack in which one entity illegitimately impersonates the identity of another in order to benefit from it. |
| fraud              | phishing             | Masquerading as another entity in order to persuade the user to reveal private credentials. |
| fraud              | unauthorized-use-of-resources            | Using resources for unauthorized purposes including profit-making ventures, e.g. the use of e-mail to participate in illegal profit chain letters or pyramid schemes.   |
| information-content-security           | data-leak            | Leaked confidential information like credentials or personal data. |
| information-content-security           | data-loss            | Loss of data, e.g. caused by harddisk failure or physical theft.  |
| information-content-security           | unauthorised-information-access          | Unauthorized access to information, e.g. by abusing stolen login credentials for a system or application, intercepting traffic or gaining access to physical documents.         |
| information-content-security           | unauthorised-information-modification    | Unauthorised modification of information, e.g. by an attacker abusing stolen login credentials for a system or application or a ransomware encrypting data. |
| information-gathering                | scanner              | Attacks that send requests to a system to discover weaknesses. This also includes testing processes to gather information on hosts, services and accounts. Examples: fingerd, DNS querying, ICMP, SMTP (EXPN, RCPT, \\...), port scanning. |
| information-gathering  | sniffing             | Observing and recording of network traffic (wiretapping). |
| information-gathering                  | social-engineering   | Gathering information from a human being in a non-technical way (e.g. lies, tricks, bribes, or threats). |
| intrusion-attempts | brute-force          | Multiple login attempts (Guessing/cracking of passwords, brute force). This IOC refers to a resource, which has been observed to perform brute-force attacks over a given application protocol. |
| intrusion-attempts | exploit              | An attack using an unknown exploit.   |
| intrusion-attempts | ids-alert            | IOCs based on a sensor network. This is a generic IOC denomination, should it be difficult to reliably denote the exact type of activity involved for example due to an anecdotal nature of the rule that triggered the alert.             |
| intrusions         | application-compromise| Compromise of an application by exploiting (un)known software vulnerabilities, e.g. SQL injection.         |
| intrusions         | burglary             | Physical intrusion, e.g. into corporate building or data center.            |
| intrusions         | privileged-account-compromise            | Compromise of a system where the attacker gained administrative privileges.        |
| intrusions         | system-compromise    | Compromise of a system, e.g. unauthorised logins or commands. This includes compromising attempts on honeypot systems.  |
| intrusions         | unprivileged-account-compromise          | Compromise of a system using an unprivileged (user/service) account.           |
| malicious-code     | c2-server            | This is a command and control server in charge of a given number of botnet drones.     |
| malicious-code     | infected-system      | This is a compromised machine, which has been observed to make a connection to a command and control server.    |
| malicious-code     | malware-configuration | This is a resource which updates botnet drones with a new configuration.     |
| malicious-code     | malware-distribution | URI used for malware distribution, e.g. a download URL included in fake invoice malware spam.              |
| other              | blacklist            | Some sources provide blacklists, which clearly refer to abusive behavior, such as spamming, but fail to denote the exact reason why a given identity has been blacklisted. The reason may be that the justification is anecdotal or missing entirely. This type should only be used if the typing fits the definition of a blacklist, but an event specific denomination is not possible for one reason or another. Not in RSIT.              |
| other              | dga-domain           | DGA domains are seen in various families of malware that are used to periodically generate a large number of domain names that can be used as rendezvous points with their command and control servers. Not in RSIT.       |
| other              | other                | All incidents which don't fit in one of the given categories should be put into this class.        |
| other              | malware              | An IoC referring to a malware (sample) itself. Not in RSIT.       |
| other              | proxy                | This refers to the use of proxies from inside your network. Not in RSIT.              |
| test               | test                 | Meant for testing. Not in RSIT.       |
| other              | tor                  | This IOC refers to incidents related to TOR network infrastructure. Not in RSIT.       |
| other              | undetermined         | The categorisation of the incident is unknown/undetermined. |
| vulnerable         | ddos-amplifier       | Publicly accessible services that can be abused for conducting DDoS reflection/amplification attacks, e.g. DNS open-resolvers or NTP servers with monlist enabled.   |
| vulnerable         | information-disclosure                   | Publicly accessible services potentially disclosing sensitive information, e.g. SNMP or Redis.     |
| vulnerable         | potentially-unwanted-accessible          | Potentially unwanted publicly accessible services, e.g. Telnet, RDP or VNC.               |
| vulnerable         | vulnerable-system    | A system which is vulnerable to certain attacks. Example: misconfigured client proxy settings (example: WPAD), outdated operating system version, etc.      |
| vulnerable         | weak-crypto          | Publicly accessible services offering weak crypto, e.g. web servers susceptible to POODLE/FREAK attacks.           |

## Meaning of source and destination identities

Meaning of source and destination identities for each `classification.type` can be different. Usually the main information is in the `source.*` fields.

The `classification.identifier` is often a normalized malware name, grouping many variants or the affected network protocol.

Examples of the meaning of the *source* and *destination* fields for various `classification.type` and possible identifiers are shown here.

| Classification Type   | Source                                 | Destination          | Possible Identifiers                 |
| --------------------- | -------------------------------------- | -------------------- | ------------------------------------ |
| blacklist             | blacklisted device                     |                      |                                      |
| brute-force           | attacker                               | target               |                                      |
| c2-server             | (sinkholed) c&c server                 |                      | zeus, palevo, feodo                  |
| ddos                  | attacker                               | target               |                                      |
| dga-domain            | infected device                        |                      |                                      |
| dropzone              | server hosting stolen data             |                      |                                      |
| exploit               | hosting server                         |                      |                                      |
| ids-alert             | triggering device                      |                      |                                      |
| infected-system       | infected device                        | contacted c&c server |                                      |
| malware               | infected device                        |                      | zeus, palevo, feodo                  |
| malware-configuration | infected device                        |                      |                                      |
| malware-distribution  | server hosting malware                 |                      |                                      |
| phishing              | phishing website                       |                      |                                      |
| proxy                 | server allowing policy/security bypass |                      |                                      |
| scanner               | scanning device                        | scanned device       | http, modbus, wordpress              |
| spam                  | infected device                        | targeted server      |                                      |
| system-compromise     | server                                 |                      |                                      |
| vulnerable-system     | vulnerable device                      |                      | heartbleed, openresolver, snmp, wpad |

Examples:

- If an event describes an IP address that connects to a zeus command and control server, it's about the infected device. Therefore the `classification.taxonomy` is `malicious-code`, `classification.type` is `infected-system` and the `classification.identifier` is `zeus`.

- If an event describes an IP address where a command and control server is running, the event's
`classification.type` is `c2-server`. The `malware.name` can have the full name, e.g. `zeus_p2p`.

## Additional Information

Information that does not fit into any of the event fields should be placed in the `extra` namespace. Therefore the keys must be prefixed with the `extra.` string. There are no other rules on key names and values for additional information.

## Fields Reference

Here you can find detailed information about all the possible fields used in an event.


"""


def info(key, value=""):
    """Format a bold Markdown label followed by a value.

    The label is *key* in title case; *value* is converted to a string and
    stripped of surrounding whitespace. The result ends with a blank line.
    """
    label = key.title()
    text = str(value).strip()
    return "**{}:** {}\n\n".format(label, text)


def main():
    """Build the Markdown source of the "event.md" documentation page.

    Loads the ``event`` section of ``intelmq/etc/harmonization.conf`` (a JSON
    file looked up relative to ``BASEDIR``) and appends one section per field,
    ordered by field name, to ``HEADER``. Each section consists of a heading
    with an anchor, a link to the field's type and the field's description.

    Returns:
        str: the complete page content.

    Raises:
        OSError: if the harmonization file cannot be opened.
        ValueError: if the file does not contain valid JSON.
        KeyError: if the ``event`` section, or a field's ``type`` or
            ``description`` entry, is missing.
    """
    # The built-in open() replaces codecs.open(), which is deprecated since Python 3.14.
    with open(os.path.join(BASEDIR, 'intelmq/etc/harmonization.conf'), encoding='utf-8') as f:
        harmonization = json.load(f)['event']

    # Collect the fragments and join them once instead of growing a string in the loop.
    parts = [HEADER]
    for key, value in sorted(harmonization.items()):
        parts.append(f"""### `{key}` <div id="{key}" />\n\n""")
        parts.append(f"**Type:** [{value['type']}](#{value['type'].lower()})\n\n")
        parts.append(value['description'])
        parts.append("\n\n")

    return "".join(parts)


if __name__ == '__main__':  # pragma: no cover
    # Render the page first: opening the target with 'w' truncates it, so a
    # failure inside main() must not leave an empty event.md behind.
    content = main()

    # newline='\n' disables newline translation, matching the bytes previously
    # written through codecs.open() (deprecated since Python 3.14), which
    # operated on the file in binary mode.
    with open(os.path.join(BASEDIR, 'docs/user/event.md'), 'w', encoding='utf-8', newline='\n') as f:
        f.write(content)
